{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 训练数据预处理\n",
    "import numpy as np\n",
    "from sklearn.utils import shuffle\n",
    "import os\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 评论语料目录\n",
    "hotel_feedbacks_dir = \"./hotel_feedbacks_sentiment\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all_positive和all_negative含有所有的正样本和负样本\n",
    "with open(hotel_feedbacks_dir + \"/\" + \"neg.txt\", \"r\", encoding=\"utf-8\") as f:\n",
    "    all_negative = [line.strip() for line in f.readlines()]\n",
    "with open(hotel_feedbacks_dir + \"/\" + \"pos.txt\", \"r\", encoding=\"utf-8\") as f:\n",
    "    all_positive = [line.strip() for line in f.readlines()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "不错，下次还考虑入住。交通也方便，在餐厅吃的也不错。\n"
     ]
    }
   ],
   "source": [
    "print(all_positive[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 获取所有文本的长度\n",
    "all_length = [len(i) for i in all_negative] + [len(i) for i in all_positive]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD8CAYAAAB+UHOxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAEfpJREFUeJzt3X+sZHdZx/H3hxbQANItXTfNbvUWXTXlD6BuSo1IlOp22ypbf5EaYldssjGpCUSNLpJYBUlajaJEwVS7cUvQUgXSjVTLWkDiHy3dQin9Qdnbsk272XaXbikQFC08/jHfW6fbe/fO7M7Mvez3/Uomc85zvjPznDNz53PPmTP3pqqQJPXneSvdgCRpZRgAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE6dutINHMsZZ5xRc3NzK92GJH1HufPOO79cVWuXG7eqA2Bubo69e/eudBuS9B0lycOjjPMQkCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdWpVfxP4RM3t+OhI4/ZffcmUO5Gk1WekPYAk+5N8PsldSfa22ulJ9iTZ167XtHqSvCfJfJK7k5w7dD/b2vh9SbZNZ5UkSaMY5xDQT1XVq6pqU5vfAdxaVRuBW9s8wEXAxnbZDrwPBoEBXAW8BjgPuGohNCRJs3cinwFsBXa16V3ApUP162vgNuC0JGcCFwJ7qupIVT0J7AG2nMDjS5JOwKgBUMDHktyZZHurrauqg236MWBdm14PPDJ020dbban6syTZnmRvkr2HDx8esT1J0rhG/RD4tVV1IMn3AnuSfGF4YVVVkppEQ1V1LXAtwKZNmyZyn5Kk5xppD6CqDrTrQ8BHGBzDf7wd2qFdH2rDDwBnDd18Q6stVZckrYBlAyDJi5K8ZGEa2AzcA+wGFs7k2Qbc1KZ3A5e3s4HOB55qh4puATYnWdM+/N3capKkFTDKIaB1wEeSLIz/h6r6tyR3ADcmuQJ4GHhjG38zcDEwD3wDeDNAVR1J8k7gjjbuHVV1ZGJrIkkay7IBUFUPAa9cpP4EcMEi9QKuXOK+dgI7x29TkjRp/ikISeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6NXIAJDklyWeT/EubPzvJ7Unmk3wwyQta/YVtfr4tnxu6j7e1+gNJLpz0ykiSRjfOHsBbgPuH5q8B3l1VPwg8CVzR6lcAT7b6u9s4kpwDXAa8AtgCvDfJKSfWviTpeI0UAEk2AJcAf9fmA7we+Oc2ZBdwaZve2uZpyy9o47cCN1TVN6vqS8A8cN4kVkKSNL5R9wD+Avhd4Ntt/mXAV6rq6Tb/KLC+Ta8HHgFoy59q45+pL3IbSdKMLRsASX4WOFRVd86gH5JsT7I3yd7Dhw/P4iElqUuj7AH8OPCGJPuBGxgc+vlL4LQkp7YxG4ADbfoAcBZAW/5S4Inh+iK3eUZVXVtVm6pq09q1a8deIUnSaJYNgKp6W1VtqKo5Bh/ifryq3gR8AvilNmwbcFOb3t3macs/XlXV6pe1s4TOBjYCn57YmkiSxnLq8kOW9HvADUn+GPgscF2rXwe8P8k8cIRBaFBV9ya5EbgPeBq4sqq+dQKPL0k6AWMFQFV9Evhkm36IRc7iqar/Bn55idu/C3jXuE1KkibPbwJLUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOrVsACT5riSfTvK5JPcm+aNWPzvJ7Unmk3wwyQta/YVtfr4tnxu6r7e1+gNJLpzWSkmSljfKHsA3gddX1SuBVwFbkpwPXAO8u6p+EHgSuKKNvwJ4stXf3caR5BzgMuAVwBbgvUlOmeTKSJJGt2wA1MDX2+zz26WA1wP/3Oq7gEvb9NY2T1t+QZK0+g1V9c2q+hIwD5w3kbWQJI1tpM8AkpyS5C7gELAHeBD4SlU93YY8Cqxv0+uBRwDa8qeAlw3XF7nN8GNtT7I3yd7Dhw+Pv0aSpJGMFABV9a2qehWwgcFv7T8yrYaq6tqq2lRVm9auXTuth5Gk7o11FlBVfQX4BPBjwGlJTm2LNgAH2vQB4CyAtvylwBPD9UVuI0masVHOAlqb5LQ2/d3AzwD3MwiCX2rDtgE3tendbZ62/ONVVa1+WTtL6GxgI/DpSa2IJGk8py4/hDOBXe2MnecBN1bVvyS5D7ghyR8DnwWua+OvA96fZB44wuDMH6rq3iQ3AvcBTwNXVtW3Jrs6kqRRLRsAVXU38OpF6g+xyFk8VfXfwC8vcV/vAt41fpuSpEnzm8CS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUqVH+JeRJb27HR0cat//qS6bciSTNjnsAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnVo2AJKcleQTSe5Lcm+St7T66Un2JNnXrte0epK8J8l8kruTnDt0X9va+H1Jtk1vtSRJyxllD+Bp4Ler6hzgfODKJOcAO4Bbq2ojcGubB7gI2Ngu24H3wSAwgKuA1wDnAVcthIYkafaWDYCqOlhVn2nTXwPuB9YDW4Fdbdgu4NI2vRW4vgZuA05LciZwIbCnqo5U1ZPAHmDLRNdGkjSysT4DSDIHvBq4HVhXVQfboseAdW16PfDI0M0ebbWl6pKkFTByACR5MfAh4K1V9dXhZVVVQE2ioSTbk+xNsvfw4cOTuEtJ0iJGCoAkz2fw5v+BqvpwKz/eDu3Qrg+1+gHgrKGbb2i1perPUlXXVtWmqtq0du3acdZFkjSGUc4CCnAdcH9V/fnQot3Awpk824CbhuqXt7OBzgeeaoeKbgE2J1nTPvzd3GqSpBUwyj+F/3HgV4HPJ7mr1X4fuBq4MckVwMPAG9uym4GLgXngG8CbAarqSJJ3Ane0ce+oqiMTWQtJ0tiWDYCq+k8gSyy+YJHxBVy5xH3tBHaO06AkaTr8JrAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnq1Kkr3cB3krkdHx1p3P6rL5lyJ5J04twDkKROLRsASXYmOZTknqHa6Un2JNnXrte0epK8J8l8kruTnDt0m21t/L4k26azOpKkUY2yB/D3wJajajuAW6tqI3Brmwe4CNjYLtuB98EgMICrgNcA5wFXLYSGJGllLBsAVfUp4MhR5a3Arja9C7h0qH59DdwGnJbkTOBCYE9VHamqJ4E9PDdUJEkzdLyfAayrqoNt+jFgXZteDzwyNO7RVluq/hxJtifZm2Tv4cOHj7M9SdJyTvhD4KoqoCbQy8L9XVtVm6pq09q1ayd1t5KkoxxvADzeDu3Qrg+1+gHgrKFxG1ptqbokaYUcbwDsBhbO5NkG3DRUv7ydDXQ+8FQ7VHQLsDnJmvbh7+ZWkyStkGW/CJbkH4GfBM5I8iiDs3muBm5McgXwMPDGNvxm4GJgHvgG8GaAqjqS5J3AHW3cO6rq6A+WJUkztGwAVNWvLLHogkXGFnDlEvezE9g5VneSpKnxm8CS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdWvavgWp8czs+OtK4/VdfMuVOJGlp7gFIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKn/CbwCvIbw5JWknsAktQpA0CSOmUASFKnDABJ6pQBIEmd8iyg7wCeLSRpGgyAk8ioQQGGhaQVOASUZEuSB5LMJ9kx68eXJA3MdA8gySnAXwM/AzwK3JFkd1XdN8s+5GElSbM/BHQeMF9VDwEkuQHYChgAq9Q4h5VGYaBIq8esA2A98MjQ/KPAa2bcg1bQpANlVAaP9Fyr7kPgJNuB7W3260keOM67OgP48mS6mqjV2hes3t5OuK9cM6FOnuuk3WZTslr7gtXb2/H09f2jDJp1ABwAzhqa39Bqz6iqa4FrT/SBkuytqk0nej+Ttlr7gtXb22rtC1Zvb/Y1vtXa2zT7mvVZQHcAG5OcneQFwGXA7hn3IElixnsAVfV0kt8EbgFOAXZW1b2z7EGSNDDzzwCq6mbg5hk81AkfRpqS1doXrN7eVmtfsHp7s6/xrdbeptZXqmpa9y1JWsX8Y3CS1KmTLgBW+k9NJDkrySeS3Jfk3iRvafU/THIgyV3tcvHQbd7W+n0gyYVT7G1/ks+3x9/baqcn2ZNkX7te0+pJ8p7W191Jzp1STz88tE3uSvLVJG9dqe2VZGeSQ0nuGaqNvY2SbGvj9yXZNqW+/jTJF9pjfyTJaa0+l+S/hrbd3wzd5kfba2C+9Z4p9Tb28zfpn90l+vrgUE/7k9zV6jPbZsd4j5j966yqTpoLgw+WHwReDrwA+Bxwzox7OBM4t02/BPgicA7wh8DvLDL+nNbnC4GzW/+nTKm3/cAZR9X+BNjRpncA17Tpi4F/BQKcD9w+o+fvMQbnMK/I9gJeB5wL3HO82wg4HXioXa9p02um0Ndm4NQ2fc1QX3PD4466n0+3XtN6v2hK22ys528aP7uL9XXU8j8D/mDW2+wY7xEzf52dbHsAz/ypiar6H2DhT03MTFUdrKrPtOmvAfcz+Ab0UrYCN1TVN6vqS8A8g/WYla3Arja9C7h0qH59DdwGnJbkzCn3cgHwYFU9fIwxU91eVfUp4MgijznONroQ2FNVR6rqSWAPsGXSfVXVx6rq6TZ7G4Pv1Syp9fY9VXVbDd5Brh9al4n2dgxLPX8T/9k9Vl/tt/g3Av94rPuYxjY7xnvEzF9nJ1sALPanJo715jtVSeaAVwO3t9Jvtl24nQu7d8y25wI+luTODL5xDbCuqg626ceAdSvQ14LLePYP5EpvrwXjbqOV6PHXGfyWuODsJJ9N8h9JfqLV1rdeZtXXOM/frLfZTwCPV9W+odrMt9lR7xEzf52dbAGwaiR5MfAh4K1V9VXgfcAPAK8CDjLY/Zy111bVucBFwJVJXje8sP2GsyKnhWXwxcA3AP/USqthez3HSm6jpSR5O/A08IFWOgh8X1W9Gvgt4B+SfM+M21qVz9+QX+HZv2zMfJst8h7xjFm9zk62AFj2T03MQpLnM3hiP1BVHwaoqser6ltV9W3gb/n/wxYz67mqDrTrQ8BHWg+PLxzaadeHZt1XcxHwmap6vPW44ttryLjbaGY9Jvk14GeBN7U3DdrhlSfa9J0Mjq3/UOth+DDRNF9r4z5/s9xmpwK/AHxwqN+ZbrPF3iNYgdfZyRYAK/6nJtqxxeuA+6vqz4fqw8fPfx5YODNhN3BZkhcmORvYyOBDp0n39aIkL1mYZvAB4j3t8RfOHtgG3DTU1+XtDITzgaeGdk+n4Vm/ka309jrKuNvoFmBzkjXt0MfmVpuoJFuA3wXeUFXfGKqvzeB/b5Dk5Qy20UOtt68mOb+9Ti8fWpdJ9zbu8zfLn92fBr5QVc8c2pnlNlvqPYKVeJ2dyKfZq/HC4BPzLzJI8LevwOO/lsGu293AXe1yMfB+4POtvhs4c+g2b2/9PsAEzspYoq+XMziz4nPAvQvbBngZcCuwD/h34PRWD4N/3vNg63vTFLfZi4AngJcO1VZkezEIoYPA/zI4pnrF8WwjBsfk59vlzVPqa57BMeCF19nftLG/2J7ju4DPAD83dD+bGLwZPwj8Fe3LoFPobeznb9I/u4v11ep/D/zGUWNnts1Y+j1i5q8zvwksSZ062Q4BSZJGZABIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktSp/wPvnQWEerIpeAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 可视化语料序列长度, 可见大部分文本的长度都在300以下\n",
    "plt.hist(all_length, bins=30)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9346"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(np.array(all_length) < 300)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 把所有的语料放到list里, 每一条语料是一个dict: {\"text\":文本, \"label\":分类}\n",
    "all_data = []\n",
    "for text in all_positive:\n",
    "    all_data.append({\"text\": text, \"label\": 1})\n",
    "for text in all_negative:\n",
    "    all_data.append({\"text\": text, \"label\": 0})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# shuffle打乱顺序\n",
    "all_data = shuffle(all_data, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 拿出5%的数据用来测试\n",
    "test_proportion = 0.05\n",
    "test_idx = int(len(all_data) * test_proportion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 分割训练集和测试集\n",
    "test_data = all_data[:test_idx]\n",
    "train_data = all_data[test_idx:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输出训练集和测试集为txt文件, 每一行为一个dict: {\"text\":文本, \"label\":分类}\n",
    "with open(\"train_sentiment.txt\", \"a\", encoding=\"utf-8\") as f:\n",
    "    for line in train_data:\n",
    "        f.write(str(line))\n",
    "        f.write(\"\\n\")\n",
    "with open(\"test_sentiment.txt\", \"a\", encoding=\"utf-8\") as f:\n",
    "    for line in test_data:\n",
    "        f.write(str(line))\n",
    "        f.write(\"\\n\")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
